library(readr)
library(magrittr)
library(plotly)
library(dplyr)
library(NbClust)
library(factoextra)
library(tidyr)
library(stringr)
library(ClusterR)
library(dendextend)
library(stringr)
library(tidyr)
library(NbClust)
library(clValid)
library(clusterSim)
library(data.table)
#Task-1
Data1 <- read_csv("Data1.csv")
#K-means clustering
# kmeans() starts from random centres; fixing the seed keeps the cluster
# numbering (and therefore the confusion matrix) identical between runs.
set.seed(123)
km1 <- kmeans(Data1[, 2:4], centers = 7, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data1[, 2:4], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data1$Class, kmf$Class)
1 2 3 4 5 6 7
1 0 0 0 0 0 0 32
2 0 0 30 0 0 0 0
3 0 0 0 0 30 0 0
4 0 0 0 30 0 0 0
5 30 0 0 0 0 0 0
6 0 30 0 0 0 0 0
7 0 0 0 0 0 30 0
#Hierarchical clustering
# Single-linkage agglomerative clustering on the dist() of columns 2-4.
hc1 <- hclust(dist(Data1[, 2:4]), method = "single")
# Dendrogram with its 7 top branches coloured.
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 7)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# Cut the tree into 7 groups.
hc1_Class <- cutree(hc1, k = 7)
#Confusion matrix - Hierarchical
table(Data1$Class, hc1_Class)
hc1_Class
1 2 3 4 5 6 7
1 32 0 0 0 0 0 0
2 0 30 0 0 0 0 0
3 0 0 30 0 0 0 0
4 0 0 0 30 0 0 0
5 0 0 0 0 30 0 0
6 0 0 0 0 0 30 0
7 0 0 0 0 0 0 30
#Plotting the original class
plot1 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(Data1$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data1$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : 0
normalized mutual information : 1
variation of information : 0
normalized var. of information : 0
----------------------------------------
specificity : 1
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : 1
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data1$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : 0
normalized mutual information : 1
variation of information : 0
normalized var. of information : 0
----------------------------------------
specificity : 1
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : 1
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
kmeans_validation
[1] 1
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] 1
Data2 <- read_csv("Data2.csv")
#K-means clustering
# kmeans() starts from random centres; fixing the seed keeps the cluster
# numbering (and therefore the confusion matrix) identical between runs.
set.seed(123)
km1 <- kmeans(Data2[, 2:4], centers = 4, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data2[, 2:4], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data2$Class, kmf$Class)
1 2 3 4
1 0 117 83 0
2 76 24 0 0
3 0 0 0 100
4 4 0 0 0
#Hierarchical clustering(centroid linkage works better)
# Centroid-linkage agglomerative clustering on the dist() of columns 2-4.
hc1 <- hclust(dist(Data2[, 2:4]), method = "centroid")
# Dendrogram with its 4 top branches coloured.
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 4)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# Cut the tree into 4 groups.
hc1_Class <- cutree(hc1, k = 4)
#Confusion matrix - Hierarchical
table(Data2$Class, hc1_Class)
hc1_Class
1 2 3 4
1 200 0 0 0
2 0 100 0 0
3 0 0 100 0
4 0 0 0 4
#Plotting the original class
plot1 <- plot_ly(
  x = kmf$X, y = kmf$Y, z = kmf$C,
  type = "scatter3d", mode = "markers",
  color = as.factor(Data2$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X, y = kmf$Y, z = kmf$C,
  type = "scatter3d", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X, y = kmf$Y, z = kmf$C,
  type = "scatter3d", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data2$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 0.9307
entropy : 0.3407
normalized mutual information : 0.7255
variation of information : 0.9679
normalized var. of information : 0.4308
----------------------------------------
specificity : 0.9397
sensitivity : 0.613
precision : 0.8545
recall : 0.613
F-measure : 0.7139
----------------------------------------
accuracy OR rand-index : 0.8201
adjusted-rand-index : 0.5878
jaccard-index : 0.555
fowlkes-mallows-index : 0.7237
mirkin-metric : 29294
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data2$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : 0
normalized mutual information : 1
variation of information : 0
normalized var. of information : 0
----------------------------------------
specificity : 1
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : 1
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
kmeans_validation
[1] 0.5877644
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] 1
#Complete linkage works better for this one
Data3 <- read_csv("Data3.csv")
#K-means clustering
# kmeans() starts from random centres; fixing the seed keeps the cluster
# numbering (and therefore the confusion matrix) identical between runs.
set.seed(123)
km1 <- kmeans(Data3[, 2:4], centers = 4, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data3[, 2:4], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data3$Class, kmf$Class)
1 2 3 4
1 100 0 0 0
2 0 0 100 0
3 0 100 0 0
4 0 0 0 100
#Hierarchical clustering(complete linkage works better)
# Complete-linkage agglomerative clustering on the dist() of columns 2-4.
hc1 <- hclust(dist(Data3[, 2:4]), method = "complete")
# Dendrogram with its 4 top branches coloured.
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 4)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# Cut the tree into 4 groups.
hc1_Class <- cutree(hc1, k = 4)
#Confusion matrix - Hierarchical
table(Data3$Class, hc1_Class)
hc1_Class
1 2 3 4
1 100 0 0 0
2 0 100 0 0
3 0 0 98 2
4 0 0 0 100
#Plotting the original class
plot1 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(Data3$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data3$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : 0
normalized mutual information : 1
variation of information : 0
normalized var. of information : 0
----------------------------------------
specificity : 1
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : 1
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data3$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 0.995
entropy : 0.0177
normalized mutual information : 0.9823
variation of information : 0.0709
normalized var. of information : 0.0348
----------------------------------------
specificity : 0.9967
sensitivity : 0.9901
precision : 0.9899
recall : 0.9901
F-measure : 0.99
----------------------------------------
accuracy OR rand-index : 0.995
adjusted-rand-index : 0.9867
jaccard-index : 0.9802
fowlkes-mallows-index : 0.99
mirkin-metric : 792
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
kmeans_validation
[1] 1
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] 0.9867009
Data4 <- read_csv("Data4.csv")
#K-means clustering
# kmeans() starts from random centres; fixing the seed keeps the cluster
# numbering (and therefore the confusion matrix) identical between runs.
set.seed(123)
km1 <- kmeans(Data4[, 2:4], centers = 2, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data4[, 2:4], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data4$Class, kmf$Class)
1 2
1 327 173
2 174 326
#Hierarchical clustering
# Single-linkage agglomerative clustering on the dist() of columns 2-4.
hc1 <- hclust(dist(Data4[, 2:4]), method = "single")
# Dendrogram with its 2 top branches coloured.
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 2)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# Cut the tree into 2 groups.
hc1_Class <- cutree(hc1, k = 2)
#Confusion matrix - Hierarchical
table(Data4$Class, hc1_Class)
hc1_Class
1 2
1 500 0
2 0 500
#Plotting the original class
plot1 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(Data4$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data4$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 0.653
entropy : 0.9314
normalized mutual information : 0.0686
variation of information : 1.8627
normalized var. of information : 0.9645
----------------------------------------
specificity : 0.5468
sensitivity : 0.5459
precision : 0.5459
recall : 0.5459
F-measure : 0.5459
----------------------------------------
accuracy OR rand-index : 0.5464
adjusted-rand-index : 0.0927
jaccard-index : 0.3754
fowlkes-mallows-index : 0.5459
mirkin-metric : 453182
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data4$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : 0
normalized mutual information : 1
variation of information : 0
normalized var. of information : 0
----------------------------------------
specificity : 1
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : 1
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
kmeans_validation
[1] 0.09272782
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] 1
Data5 <- read_csv("Data5.csv")
#K-means clustering
# kmeans() starts from random centres; fixing the seed keeps the cluster
# numbering (and therefore the confusion matrix) identical between runs.
set.seed(123)
km1 <- kmeans(Data5[, 2:4], centers = 2, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data5[, 2:4], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data5$Class, kmf$Class)
1 2
1 174 226
2 0 400
#Hierarchical clustering
# Single-linkage agglomerative clustering on the dist() of columns 2-4.
hc1 <- hclust(dist(Data5[, 2:4]), method = "single")
# Dendrogram with its 2 top branches coloured.
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 2)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# Cut the tree into 2 groups.
hc1_Class <- cutree(hc1, k = 2)
#Confusion matrix - Hierarchical
table(Data5$Class, hc1_Class)
hc1_Class
1 2
1 400 0
2 0 400
#Plotting the original class
plot1 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(Data5$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data5$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 0.7175
entropy : 0.4939
normalized mutual information : 0.2981
variation of information : 1.2322
normalized var. of information : 0.8248
----------------------------------------
specificity : 0.435
sensitivity : 0.7536
precision : 0.5709
recall : 0.7536
F-measure : 0.6497
----------------------------------------
accuracy OR rand-index : 0.5941
adjusted-rand-index : 0.1885
jaccard-index : 0.4811
fowlkes-mallows-index : 0.6559
mirkin-metric : 259448
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data5$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : 0
normalized mutual information : 1
variation of information : 0
normalized var. of information : 0
----------------------------------------
specificity : 1
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : 1
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
kmeans_validation
[1] 0.1885336
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] 1
Data6 <- read_csv("Data6.csv")
#K-means clustering
# kmeans() starts from random centres; fixing the seed keeps the cluster
# numbering (and therefore the confusion matrix) identical between runs.
set.seed(123)
km1 <- kmeans(Data6[, 2:3], centers = 2, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data6[, 2:3], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data6$Class, kmf$Class)
1 2
1 46 2002
2 1895 153
#Hierarchical clustering(complete linkage works better)
# Complete-linkage agglomerative clustering on the dist() of columns 2-3.
hc1 <- hclust(dist(Data6[, 2:3]), method = "complete")
# Dendrogram with its 2 top branches coloured.
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 2)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# Cut the tree into 2 groups.
hc1_Class <- cutree(hc1, k = 2)
#Confusion matrix - Hierarchical
table(Data6$Class, hc1_Class)
hc1_Class
1 2
1 1631 417
2 2044 4
#Plotting the original class
# type = "scatter" is stated explicitly so plotly does not have to guess the
# trace type for these 2-D plots.
plot1 <- plot_ly(
  x = kmf$X1, y = kmf$X2,
  type = "scatter", mode = "markers",
  color = as.factor(Data6$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X1, y = kmf$X2,
  type = "scatter", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X1, y = kmf$X2,
  type = "scatter", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data6$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 0.9514
entropy : 0.2691
normalized mutual information : 0.7296
variation of information : 0.5403
normalized var. of information : 0.4257
----------------------------------------
specificity : 0.9062
sensitivity : 0.9089
precision : 0.9064
recall : 0.9089
F-measure : 0.9076
----------------------------------------
accuracy OR rand-index : 0.9075
adjusted-rand-index : 0.8151
jaccard-index : 0.8309
fowlkes-mallows-index : 0.9076
mirkin-metric : 1551006
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data6$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 0.6008
entropy : 0.3747
normalized mutual information : 0.1394
variation of information : 1.2717
normalized var. of information : 0.9251
----------------------------------------
specificity : 0.2048
sensitivity : 0.8358
precision : 0.5123
recall : 0.8358
F-measure : 0.6353
----------------------------------------
accuracy OR rand-index : 0.5202
adjusted-rand-index : 0.0406
jaccard-index : 0.4655
fowlkes-mallows-index : 0.6544
mirkin-metric : 8047470
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
kmeans_validation
[1] 0.8150606
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] 0.04058039
Data7 <- read_csv("Data7.csv")
#K-means clustering
# kmeans() starts from random centres; fixing the seed keeps the cluster
# numbering (and therefore the confusion matrix) identical between runs.
set.seed(123)
km1 <- kmeans(Data7[, 2:3], centers = 6, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data7[, 2:3], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data7$Class, kmf$Class)
1 2 3 4 5 6
1 395 0 0 0 0 0
2 0 71 76 55 86 75
3 0 0 0 0 0 3
4 0 0 0 0 3 0
5 0 0 3 0 0 0
6 0 0 0 3 0 0
#Hierarchical clustering
# Single-linkage agglomerative clustering on the dist() of columns 2-3.
hc1 <- hclust(dist(Data7[, 2:3]), method = "single")
# Dendrogram with its 6 top branches coloured.
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 6)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# Cut the tree into 6 groups.
hc1_Class <- cutree(hc1, k = 6)
#Confusion matrix - Hierarchical
table(Data7$Class, hc1_Class)
hc1_Class
1 2 3 4 5 6
1 0 0 0 0 395 0
2 0 0 0 0 0 363
3 0 0 0 3 0 0
4 0 3 0 0 0 0
5 3 0 0 0 0 0
6 0 0 3 0 0 0
#Plotting the original class
# All three plots now state type = "scatter" and mode = "markers"; the first
# two previously relied on plotly guessing both.
plot1 <- plot_ly(
  x = kmf$X1, y = kmf$X2,
  type = "scatter", mode = "markers",
  color = as.factor(Data7$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X1, y = kmf$X2,
  type = "scatter", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X1, y = kmf$X2,
  type = "scatter", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data7$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 0.9844
entropy : 0.4208
normalized mutual information : 0.6367
variation of information : 1.1822
normalized var. of information : 0.533
----------------------------------------
specificity : 0.9943
sensitivity : 0.6346
precision : 0.9905
recall : 0.6346
F-measure : 0.7735
----------------------------------------
accuracy OR rand-index : 0.8199
adjusted-rand-index : 0.6355
jaccard-index : 0.6307
fowlkes-mallows-index : 0.7928
mirkin-metric : 106658
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data7$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : 0
normalized mutual information : 1
variation of information : 0
normalized var. of information : 0
----------------------------------------
specificity : 1
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : 1
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
kmeans_validation
[1] 0.6355487
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] 1
Data8 <- read_csv("Data8.csv")
#K-means clustering
# Seed fixed for consistency with the other datasets' k-means runs
# (kmeans() draws its starting centres at random).
set.seed(123)
km1 <- kmeans(Data8[, 2:4], centers = 1, nstart = 30)
# Feature columns plus the assigned cluster id; the plots below read from kmf.
kmf <- cbind(Data8[, 2:4], Class = km1$cluster)
#Confusion matrix - Kmeans
table(Data8$Class, kmf$Class)
1
1 4002
#Hierarchical clustering
# Single-linkage agglomerative clustering on the dist() of columns 2-4.
hc1 <- hclust(dist(Data8[, 2:4]), method = "single")
# Dendrogram coloured as a single branch (k = 1).
dd1 <- hc1 %>%
  as.dendrogram() %>%
  color_branches(k = 1)
#dd1 <- set(dd1, "labels_cex", 0.3)
plot(dd1)
# k = 1 keeps every observation in one group.
hc1_Class <- cutree(hc1, k = 1)
#Confusion matrix - Hierarchical
table(Data8$Class, hc1_Class)
hc1_Class
1
1 4002
#Plotting the original class
plot1 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(Data8$Class)
) %>%
  layout(title = 'Plot with data points colored based on original class')
plot1
#Plotting the K-means class
plot2 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(kmf$Class)
) %>%
  layout(title = 'Plot with data points colored based on K-Means output')
plot2
#Plotting the Hierarchical class
# Stored as plot3 so the K-means figure kept in plot2 is not overwritten.
plot3 <- plot_ly(
  x = kmf$X1, y = kmf$X2, z = kmf$X3,
  type = "scatter3d", mode = "markers",
  color = as.factor(hc1_Class)
) %>%
  layout(title = 'Plot with data points colored based on Hierarchical output')
plot3
#External Validations
# Returns the adjusted Rand index; summary_stats = TRUE also prints the full
# table of external metrics.
kmeans_validation <- external_validation(
  Data8$Class, kmf$Class,
  method = "adjusted_rand_index", summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : NaN
normalized mutual information : 1
variation of information : 0
normalized var. of information : 1
----------------------------------------
specificity : NaN
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : NaN
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index of the hierarchical labels against the true classes
# (summary_stats = TRUE also prints the full metric table).
hierarchical_validation <- external_validation(
  Data8$Class,
  hc1_Class,
  method = "adjusted_rand_index",
  summary_stats = TRUE
)
----------------------------------------
purity : 1
entropy : NaN
normalized mutual information : 1
variation of information : 0
normalized var. of information : 1
----------------------------------------
specificity : NaN
sensitivity : 1
precision : 1
recall : 1
F-measure : 1
----------------------------------------
accuracy OR rand-index : 1
adjusted-rand-index : NaN
jaccard-index : 1
fowlkes-mallows-index : 1
mirkin-metric : 0
----------------------------------------
# Adjusted Rand index returned by external_validation() for the K-means labels.
# NOTE(review): NaN here is presumably because the index is undefined when both
# labelings contain a single class - confirm against the ClusterR documentation.
kmeans_validation
[1] NaN
# Adjusted Rand index returned by external_validation() for the hierarchical labels.
hierarchical_validation
[1] NaN
#Task-2
world_i1 <- read_csv("World Indicators.csv")
# Strip the formatting characters ("%", "$" and thousands separators) by
# pattern and convert the three text columns to numeric. A pattern is used
# instead of substr(x, 2, length(x)): length(x) is the number of rows, not the
# width of each string, and dropping the first character unconditionally would
# eat a digit from any value that lacks the "$" prefix.
world_i1$`Business Tax Rate` <- as.numeric(
  gsub("%", "", world_i1$`Business Tax Rate`)
)
world_i1$`Health Exp/Capita` <- as.numeric(
  gsub("[$,]", "", world_i1$`Health Exp/Capita`)
)
world_i1$GDP <- as.numeric(gsub("[$,]", "", world_i1$GDP))
#Removing the two columns with maximum number of null values and scaling
World_i2 <- world_i1[, -c(4, 11)] %>%
  drop_na()
# Standardise the first 16 columns (the ones used for clustering below).
df <- data.frame(scale(World_i2[, 1:16]))
#Finding the optimal number of clusters
# kmeans() uses random starting centres; fixing the seed makes the diagnostics
# and the 2-cluster fit below reproducible.
set.seed(123)
#Elbow method
fviz_nbclust(df[, 1:16], kmeans, method = "wss")
#Silhouette method
fviz_nbclust(df[, 1:16], kmeans, method = "silhouette")
#Finding distance
Dist_KM <- get_dist(df)
#Visualizing the distance values
fviz_dist(Dist_KM)
#As both the methods show around 2 as the optimal number, we cluster them into two classes below
km <- kmeans(df[, 1:16], centers = 2, nstart = 20)
#Visualizing the cluster from k-means
fviz_cluster(km, data = df[, 1:16])
km$cluster
[1] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 1 1 2 2 1 2
[57] 1 1 2 2 1 1 1 2 2 1 1 2 2 2 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[113] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 2 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 1 1 1 2 1
[169] 1 1 1
# Record the k-means labels on both the unscaled and the scaled data.
World_i2[["KM"]] <- km$cluster
df[["KM"]] <- km$cluster
#Internal validation for k-means - validated using dunn index
dunn_km <- dunn(Data = df[, 1:16], clusters = df[["KM"]])
dunn_km
[1] 0.06697056
# Print the centres of the current k-means fit (one row per cluster, in the scaled units of df).
km$centers
Birth.Rate Business.Tax.Rate Days.to.Start.Business GDP Health.Exp...GDP Health.Exp.Capita Hours.to.do.Tax
1 -0.7087267 -0.1429606 -0.1949461 0.1732969 0.1399650 0.3796016 -0.04960754
2 0.9514413 0.1919197 0.2617084 -0.2326451 -0.1878983 -0.5096021 0.06659642
Infant.Mortality.Rate Internet.Usage Life.Expectancy.Female Life.Expectancy.Male Mobile.Phone.Usage
1 -0.6904214 0.6867375 0.6891531 0.6674966 0.600936
2 0.9268671 -0.9219216 -0.9251645 -0.8960913 -0.806736
Population.0.14 Population.15.64 Population.65. Population.Urban
1 -0.7377098 0.6860028 0.5746374 0.5496109
2 0.9903501 -0.9209353 -0.7714310 -0.7378338
#Three classes
# kmeans() uses random starting centres; fix the seed so the 3-cluster
# assignment is reproducible.
set.seed(123)
km <- kmeans(df[, 1:16], centers = 3, nstart = 20)
#Visualizing the cluster from k-means
fviz_cluster(km, data = df[, 1:16])
km$cluster
[1] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 1 2 2 1 2 2 1
[57] 2 2 1 2 3 2 3 2 1 2 2 1 1 2 2 2 1 2 1 2 2 2 3 2 3 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 2 2 3 2 3 2 2 3 3 2 3 2 2 2 2 3
[113] 3 3 3 2 2 3 2 2 1 2 2 2 2 2 2 2 2 1 3 2 1 1 3 1 1 1 1 2 2 2 2 2 1 2 3 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2
[169] 3 2 2
#Internal validation for k-means - validated using dunn index
# Score the 3-cluster fit currently held in `km`. df$KM still stores the
# earlier 2-cluster labels, so passing it here would only repeat the k = 2
# Dunn index.
dunn_km <- dunn(clusters = km$cluster, Data = df[, 1:16])
dunn_km
[1] 0.06697056
# Print the centres of the current (3-cluster) k-means fit, in the scaled units of df.
km$centers
Birth.Rate Business.Tax.Rate Days.to.Start.Business GDP Health.Exp...GDP Health.Exp.Capita Hours.to.do.Tax
1 1.0977304 0.2220355 0.125770071 -0.24189747 -0.1675895 -0.5174629 0.12191236
2 -0.5187961 -0.1321157 0.007281763 -0.08747886 -0.2478830 -0.2458843 0.05915505
3 -1.0236822 -0.1213366 -0.327716114 0.86203103 1.1878672 2.0293338 -0.48196895
Infant.Mortality.Rate Internet.Usage Life.Expectancy.Female Life.Expectancy.Male Mobile.Phone.Usage
1 1.0827672 -0.9881109 -1.0659632 -1.0183934 -0.9344346
2 -0.5315755 0.2826238 0.4572462 0.3817182 0.5353050
3 -0.9471209 1.5029167 1.1408266 1.2637652 0.5759372
Population.0.14 Population.15.64 Population.65. Population.Urban
1 1.1086122 -1.0757826 -0.8045752 -0.8069778
2 -0.4960104 0.6698899 0.1124584 0.2877398
3 -1.1219121 0.4939742 1.5948711 1.0478823
#NOTE: for k-means the clValid summary further down reports a Dunn index of 0.0670 for k=2 and 0.0705 for k=3, so k=3 is marginally higher, not k=2
#Hierarchical clustering with "single" linkage
# One label per clustered row; taking the whole column avoids hard-coding the
# row count (171) and stays correct if the cleaned data changes size.
labs <- World_i2$Country
# The Euclidean distance matrix is identical for all three linkages, so it is
# computed once.
feature_dist <- dist(df[, 1:16])
hc.single <- hclust(feature_dist, method = "single")
dd <- as.dendrogram(hc.single)
dd <- color_branches(dd, k = 2)
plot(dd)
plot(hc.single, labels = labs)
#Hierarchical clustering with "average" linkage
hc.average <- hclust(feature_dist, method = "average")
dd <- as.dendrogram(hc.average)
dd <- color_branches(dd, k = 2)
plot(dd)
plot(hc.average, labels = labs)
#Hierarchical clustering with "complete" linkage
hc.complete <- hclust(feature_dist, method = "complete")
dd <- as.dendrogram(hc.complete)
dd <- color_branches(dd, k = 2)
plot(dd)
plot(hc.complete, labels = labs)
#Internal validation for Hierarchical clustering
#Two clusters
nc_1 <- 2
Dist <- dist(df[, 1:16], method = "euclidean")
# Cut the complete-linkage tree and score the partition with the Dunn index.
hc_cluster1 <- cutree(hc.complete, k = nc_1)
dunn(distance = Dist, clusters = hc_cluster1)
[1] 0.5198649
# Keep the 2-cluster labels on both the unscaled and the scaled data.
World_i2[["hc_cluster"]] <- hc_cluster1
df[["hc_cluster"]] <- hc_cluster1
#Three clusters
nc_1 <- 3
Dist <- dist(df[, 1:16], method = "euclidean")
# Re-cut the same complete-linkage tree into 3 groups and score it.
hc_cluster1 <- cutree(hc.complete, k = nc_1)
dunn(distance = Dist, clusters = hc_cluster1)
[1] 0.564507
#Dunn index yields a larger value when k=3 for hierarchical clustering
#Overall, the best result is obtained with hierarchical clustering with k=3 (validated using dunn index). Hence using the same to categorize the countries
# hc_cluster1 holds the 3-cluster labels at this point; split the rows by it.
df_cluster_1 <- subset(World_i2, hc_cluster1 == 1)
df_cluster_2 <- subset(World_i2, hc_cluster1 == 2)
df_cluster_3 <- subset(World_i2, hc_cluster1 == 3)
#Countries in each group
c1 <- df_cluster_1[["Country"]]
c2 <- df_cluster_2[["Country"]]
c3 <- df_cluster_3[["Country"]]
# Internal validation of both methods for 2 to 7 clusters.
internal_validation <- clValid(
  df[, 1:16],
  nClust = 2:7,
  clMethods = c("hierarchical", "kmeans"),
  validation = "internal"
)
summary(internal_validation)
Clustering Methods:
hierarchical kmeans
Cluster sizes:
2 3 4 5 6 7
Validation Measures:
2 3 4 5 6 7
hierarchical Connectivity 2.9290 5.8579 8.7869 13.3238 31.6563 34.5853
Dunn 0.5199 0.5645 0.5167 0.3359 0.1953 0.1953
Silhouette 0.5979 0.5731 0.5168 0.4223 0.3572 0.3160
kmeans Connectivity 29.0286 30.6075 43.5087 46.6631 40.0024 52.5012
Dunn 0.0670 0.0705 0.0687 0.0713 0.1267 0.1100
Silhouette 0.3611 0.3639 0.2991 0.2891 0.3716 0.3049
Optimal Scores:
# Plot the clValid internal-validation results.
plot(internal_validation)
#Plotting
#We compute the correlation between every pair of columns and plot the strongly correlated pairs, coloured by class
#correlation function
cor(World_i2[, 1:16])
Birth Rate Business Tax Rate Days to Start Business GDP Health Exp % GDP
Birth Rate 1.0000000 0.25832327 0.10627023 -0.23384759 -0.221273544
Business Tax Rate 0.2583233 1.00000000 0.02645857 0.02543613 -0.078449904
Days to Start Business 0.1062702 0.02645857 1.00000000 -0.04970137 -0.148395627
GDP -0.2338476 0.02543613 -0.04970137 1.00000000 0.338555911
Health Exp % GDP -0.2212735 -0.07844990 -0.14839563 0.33855591 1.000000000
Health Exp/Capita -0.4885708 -0.09342338 -0.13000616 0.43195417 0.494636086
Hours to do Tax 0.1139401 0.15201992 0.14719366 0.04531425 -0.082171525
Infant Mortality Rate 0.8728444 0.25331039 0.11504030 -0.19652001 -0.142575915
Internet Usage -0.8096124 -0.19386059 -0.17351210 0.25588950 0.281448992
Life Expectancy Female -0.8683710 -0.20693869 -0.12855991 0.21897245 0.169783887
Life Expectancy Male -0.8375751 -0.23108486 -0.14802986 0.23636298 0.190877771
Mobile Phone Usage -0.6706084 -0.22394032 -0.08859167 0.04812196 -0.014945233
Population 0-14 0.9642443 0.20842064 0.14390213 -0.24386520 -0.225838389
Population 15-64 -0.8777268 -0.24823919 -0.09475062 0.15272895 0.001004394
Population 65+ -0.7758136 -0.09118757 -0.16290533 0.28595966 0.449373106
Population Urban -0.5889220 -0.09274097 -0.05135881 0.22205852 0.162976604
Health Exp/Capita Hours to do Tax Infant Mortality Rate Internet Usage Life Expectancy Female
Birth Rate -0.48857076 0.11394011 0.8728444 -0.8096124 -0.8683710
Business Tax Rate -0.09342338 0.15201992 0.2533104 -0.1938606 -0.2069387
Days to Start Business -0.13000616 0.14719366 0.1150403 -0.1735121 -0.1285599
GDP 0.43195417 0.04531425 -0.1965200 0.2558895 0.2189724
Health Exp % GDP 0.49463609 -0.08217153 -0.1425759 0.2814490 0.1697839
Health Exp/Capita 1.00000000 -0.21585274 -0.4646887 0.7232246 0.5250518
Hours to do Tax -0.21585274 1.00000000 0.1677532 -0.1782408 -0.1365328
Infant Mortality Rate -0.46468871 0.16775316 1.0000000 -0.7827817 -0.9261500
Internet Usage 0.72322459 -0.17824084 -0.7827817 1.0000000 0.7994866
Life Expectancy Female 0.52505178 -0.13653281 -0.9261500 0.7994866 1.0000000
Life Expectancy Male 0.57618819 -0.18072133 -0.9021864 0.8120771 0.9751152
Mobile Phone Usage 0.33115351 -0.03731317 -0.6903296 0.6486685 0.6410972
Population 0-14 -0.52558017 0.12541678 0.8393445 -0.8446817 -0.8381414
Population 15-64 0.30542898 -0.13468736 -0.7715820 0.7128263 0.7428294
Population 65+ 0.64962746 -0.07431018 -0.6654919 0.7531107 0.7009784
Population Urban 0.52094854 0.00397962 -0.5837639 0.6884567 0.6147325
Life Expectancy Male Mobile Phone Usage Population 0-14 Population 15-64 Population 65+
Birth Rate -0.8375751 -0.67060841 0.9642443 -0.877726752 -0.77581357
Business Tax Rate -0.2310849 -0.22394032 0.2084206 -0.248239186 -0.09118757
Days to Start Business -0.1480299 -0.08859167 0.1439021 -0.094750616 -0.16290533
GDP 0.2363630 0.04812196 -0.2438652 0.152728948 0.28595966
Health Exp % GDP 0.1908778 -0.01494523 -0.2258384 0.001004394 0.44937311
Health Exp/Capita 0.5761882 0.33115351 -0.5255802 0.305428983 0.64962746
Hours to do Tax -0.1807213 -0.03731317 0.1254168 -0.134687360 -0.07431018
Infant Mortality Rate -0.9021864 -0.69032956 0.8393445 -0.771582041 -0.66549191
Internet Usage 0.8120771 0.64866852 -0.8446817 0.712826274 0.75311072
Life Expectancy Female 0.9751152 0.64109715 -0.8381414 0.742829375 0.70097835
Life Expectancy Male 1.0000000 0.60645028 -0.8031596 0.721028735 0.65980935
Mobile Phone Usage 0.6064503 1.00000000 -0.6832376 0.655548188 0.50566329
Population 0-14 -0.8031596 -0.68323759 1.0000000 -0.899067027 -0.81912396
Population 15-64 0.7210287 0.65554819 -0.8990670 1.000000000 0.48534210
Population 65+ 0.6598094 0.50566329 -0.8191240 0.485342103 1.00000000
Population Urban 0.6292576 0.57532418 -0.6211391 0.564644276 0.50060369
Population Urban
Birth Rate -0.58892197
Business Tax Rate -0.09274097
Days to Start Business -0.05135881
GDP 0.22205852
Health Exp % GDP 0.16297660
Health Exp/Capita 0.52094854
Hours to do Tax 0.00397962
Infant Mortality Rate -0.58376391
Internet Usage 0.68845669
Life Expectancy Female 0.61473249
Life Expectancy Male 0.62925757
Mobile Phone Usage 0.57532418
Population 0-14 -0.62113912
Population 15-64 0.56464428
Population 65+ 0.50060369
Population Urban 1.00000000
#Plot1 #Population 0-14 and birthrate seems to have large correlation
# Points are coloured by hc_cluster1, the 3-cluster hierarchical labels chosen
# above for grouping the countries; World_i2$hc_cluster still holds the
# 2-cluster assignment. type = "scatter" is stated so plotly need not guess.
plot1 <- plot_ly(
  x = World_i2$`Birth Rate`, y = World_i2$`Population 0-14`,
  type = "scatter", mode = "markers",
  color = as.factor(hc_cluster1)
) %>%
  layout(title = 'Birthrate vs Population 0-14')
plot1
#Plot2 #Life expectancy female and birthrate seems to have a correlation
plot2 <- plot_ly(
  x = World_i2$`Birth Rate`, y = World_i2$`Life Expectancy Female`,
  type = "scatter", mode = "markers",
  color = as.factor(hc_cluster1)
) %>%
  layout(title = 'Birthrate vs Life expectancy female')
plot2
#Plot3 #Health Exp/Capita and internet usage also looks like there is a correlation
plot3 <- plot_ly(
  x = World_i2$`Health Exp/Capita`, y = World_i2$`Internet Usage`,
  type = "scatter", mode = "markers",
  color = as.factor(hc_cluster1)
) %>%
  layout(title = 'Health Exp/Capita vs Internet usage')
plot3